# Computations
import numpy as np
import pandas as pd
import pickle
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so prefer the new name and fall back
# to the legacy one on older installations.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
# Global font sizes / text colour used by every matplotlib figure below.
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we analyze the UCI Statlog (German Credit Data) dataset, obtained from Kaggle.com.
The original dataset contains 1000 entries with 20 categorical/symbolic attributes prepared by Prof. Hofmann. In this dataset, each entry represents a person who takes credit from a bank. Each person is classified as a good or bad credit risk according to the set of attributes. The link to the original dataset can be found below.
It is almost impossible to understand the original dataset due to its complicated system of categories and symbols. Thus, I wrote a small Python script to convert it into a readable CSV file. Several columns are simply ignored, because in my opinion either they are not important or their descriptions are obscure. The selected attributes are:
# Location of the dataset CSV; the names of the derived files read later in
# this notebook are built from this path with its extension stripped.
Path = 'Statlog_Dataset/german_credit_data.csv'
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print a coloured banner: `Text` on a `C` background, then a rule of '='.

    Text: caption to print (written in foreground colour `T`).
    L:    total target width; the rule is `L - len(Text) - 1` characters long.
    C:    colour name used for the caption background and for the rule.
    T:    colour name used for the caption text.
    """
    backgrounds = dict(Black=Back.BLACK, Red=Back.RED, Green=Back.GREEN, Yellow=Back.YELLOW,
                       Blue=Back.BLUE, Magenta=Back.MAGENTA, Cyan=Back.CYAN)
    foregrounds = dict(Black=Fore.BLACK, Red=Fore.RED, Green=Fore.GREEN, Yellow=Fore.YELLOW,
                       Blue=Fore.BLUE, Magenta=Fore.MAGENTA, Cyan=Fore.CYAN, White=Fore.WHITE)
    # Highlighted caption, reset so the separating space is unstyled.
    caption = backgrounds[C] + foregrounds[T] + Style.NORMAL + Text + Style.RESET_ALL
    # Rule that pads the line out to roughly L characters.
    rule = foregrounds[C] + Style.NORMAL + (L - len(Text) - 1) * '=' + Style.RESET_ALL
    print(caption + ' ' + rule)
def Line(L=100, C = 'Blue'):
    """Print a horizontal rule of `L` '=' characters in foreground colour `C`."""
    palette = dict(Black=Fore.BLACK, Red=Fore.RED, Green=Fore.GREEN, Yellow=Fore.YELLOW,
                   Blue=Fore.BLUE, Magenta=Fore.MAGENTA, Cyan=Fore.CYAN, White=Fore.WHITE)
    pieces = [palette[C], Style.NORMAL, L * '=', Style.RESET_ALL]
    print(''.join(pieces))
def Search_List(Key, List):
    """Return the items of `List` that contain `Key`, in their original order."""
    matches = []
    for item in List:
        if Key in item:
            matches.append(item)
    return matches
# Load the "_Mod" companion CSV that sits next to Path. rsplit strips only the
# final extension, so dots elsewhere in the path (e.g. "./data/x.csv") no
# longer truncate the file name.
Data = pd.read_csv(Path.rsplit(".", 1)[0] + '_Mod.csv')
Header('Standardized Dataset:')
display(Data.head())
# One-row summary of the frame's shape, shown without the index column.
_shape_summary = pd.DataFrame({'Number of Instances': [Data.shape[0]],
                               'Number of Attributes': [Data.shape[1]]}).style
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0 in
# favour of Styler.hide(axis='index'); support both.
if hasattr(_shape_summary, 'hide'):
    _shape_summary = _shape_summary.hide(axis='index')
else:
    _shape_summary = _shape_summary.hide_index()
display(_shape_summary)
# Dictionaries
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# a "_Feat_Dict.pkl" that was produced locally / comes from a trusted source.
# rsplit strips only the final extension so dots elsewhere in Path are kept.
with open(Path.rsplit(".", 1)[0] + '_Feat_Dict.pkl', 'rb') as fp:
    Feat_Dict = pickle.load(fp)
# Name of the label column and its distinct values in title case.
Target = 'Risk'
Labels = [value.title() for value in Data[Target].unique()]
def dtypes_group(Inp, Dict = False):
    """Group the columns of a DataFrame by their dtype.

    Inp:  DataFrame whose columns are inspected.
    Dict: when True, return ``{dtype name: [column, ...]}``; otherwise return
          a DataFrame with the columns 'Data Type', 'Features' and 'Count'
          (one row per dtype).

    Columns inside each group keep the order they have in `Inp`.
    """
    # A stable sort keeps same-dtype columns in their original relative order
    # (the default quicksort gives no such guarantee).
    ordered = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'], kind='mergesort')
    type_names = ordered['Data Type'].astype(str)

    # dicts preserve insertion order, so groups appear in sorted-dtype order.
    grouped = {}
    for feature, type_name in type_names.items():
        grouped.setdefault(type_name, []).append(feature)

    if Dict:
        return grouped
    return pd.DataFrame({'Data Type': list(grouped),
                         'Features': list(grouped.values()),
                         'Count': [len(features) for features in grouped.values()]},
                        columns=['Data Type', 'Features', 'Count'])
# Mapping from dtype name (e.g. 'int64') to the list of Data columns of that dtype.
dType = dtypes_group(Data, Dict = True)
Standardized Dataset: ==============================================================================
| | Age | Sex | Job | Housing | Saving Accounts | Checking Account | Credit Amount | Duration | Purpose | Risk |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 67 | Male | 2 | Own | None | Little | 1169 | 6 | Radio/TV | Good |
| 1 | 22 | Female | 2 | Own | Little | Moderate | 5951 | 48 | Radio/TV | Bad |
| 2 | 49 | Male | 1 | Own | Little | None | 2096 | 12 | Education | Good |
| 3 | 45 | Male | 2 | Free | Little | Little | 7882 | 42 | Furniture/Equipment | Good |
| 4 | 53 | Male | 2 | Free | Little | Little | 4870 | 24 | Car | Bad |
| Number of Instances | Number of Attributes |
|---|---|
| 1000 | 10 |
def FeatAgg(Feat, ColorFeat, Target = Target, Inp = Data):
    """Count rows per (Feat, ColorFeat, Target) combination.

    Returns a DataFrame with the three key columns plus 'Count' and
    'Percentage' (each count as a share of the total count, rounded to two
    decimals), sorted by `Feat`, with `Feat` converted to strings.
    """
    keys = [Feat, ColorFeat, Target]
    counts = Inp[keys].groupby(keys)[Target].agg({'count'})
    counts = counts.rename(columns = {'count': 'Count'})
    # Share of the grand total, in percent.
    grand_total = counts.sum().values
    counts['Percentage'] = np.round(100 * counts.values / grand_total, 2)
    # Move the grouping keys back into ordinary columns.
    counts = counts.reset_index(drop = False)
    counts = counts.sort_values(by = [Feat])
    counts[Feat] = counts[Feat].astype(str)
    return counts
def FeatBins(Inp, Bins, replace = True, String = True, Dict = None):
    """Bin a numeric Series into the right-closed intervals defined by `Bins`.

    Inp:     values to bin (passed to ``pd.cut``).
    Bins:    ordered bin edges; consecutive pairs form the intervals
             ``(Bins[i], Bins[i+1]]``. Edges are truncated to ``int``.
    replace: when True, convert the interval labels to strings and relabel
             them with `Dict`.
    String:  when True, return the labels as strings.
    Dict:    optional ``{label: replacement}`` mapping used when `replace` is
             True. The original code read an undefined global named ``Dict``
             and raised NameError on the default path; for backward
             compatibility a module-level ``Dict`` is still honoured when this
             argument is omitted, and relabelling is skipped if none exists.

    For string results a leading "(-1" is rewritten to "[0", so a first bin
    starting at -1 reads as "[0, ...".
    """
    edges = [int(x) for x in Bins]
    intervals = pd.IntervalIndex.from_tuples(list(zip(edges[:-1], edges[1:])))
    Out = pd.cut(Inp, bins = intervals)
    if replace:
        Out = Out.astype('str')
        mapping = Dict if Dict is not None else globals().get('Dict')
        if mapping is not None:
            Out = Out.replace(mapping)
    if String:
        Out = Out.astype('str')
    if replace or String:
        try:
            # regex=False: "(-1" is not a valid regular expression, so on
            # pandas versions where regex defaulted to True the call raised
            # and the old bare ``except`` silently skipped the rewrite.
            Out = Out.str.replace('(-1', '[0', regex = False)
        except AttributeError:
            # No .str accessor (e.g. labels were mapped to non-strings).
            pass
    return Out
def DistPlot(Feat, Target = Target, nbins = 20,
             Colors = ['LightSalmon', 'LightBlue'], LC = 'Black',
             yLim = [0, 80], H = 450, titleY = 0.92, Inp = Data):
    """Show a histogram of `Feat` coloured by `Target`, with a marginal box plot.

    Dotted vertical lines mark the median and the mean of ``Inp[Feat]``.

    Feat:   column plotted on the x axis.
    Target: column used to colour the bars.
    nbins:  number of histogram bins requested from plotly.
    Colors: colour sequence for the `Target` classes (not modified).
    LC:     marker outline colour.
    yLim:   [low, high] range of the y axis; also the vertical extent of the
            median/mean lines.
    H:      figure height in pixels.
    titleY: vertical position of the title.
    Inp:    DataFrame to plot.
    """
    # hover_data follows Inp (previously the global Data was used even when a
    # different frame was passed in).
    fig = px.histogram(Inp, x = Feat, nbins = nbins, color = Target, marginal = 'box',
                       color_discrete_sequence = Colors, hover_data = Inp.columns)
    axis_style = dict(showline = True, linewidth = 1, linecolor = 'Lightgray', mirror = True,
                      zerolinewidth = 1, zerolinecolor = 'Black',
                      gridwidth = 1, gridcolor = 'Lightgray')
    fig.update_xaxes(zeroline = False, showgrid = False, **axis_style)
    fig.update_yaxes(zeroline = True, showgrid = True, **axis_style)
    # Vertical reference lines. x is sized from y so both arrays always have
    # the same length, including when yLim[0] is not 0.
    y_line = np.arange(int(yLim[0]), int(yLim[1]))
    references = (('Median', Inp[Feat].median(), 'RoyalBlue'),
                  ('Mean', Inp[Feat].mean(), 'Red'))
    for label, value, colour in references:
        fig.add_trace(go.Scatter(x = np.full(len(y_line), value), y = y_line, name = label,
                                 line = dict(color = colour, width = 2, dash = 'dot')))
    Name = '%s Distribution by %s' % (Target, Feat)
    fig.update_layout(legend_orientation = 'v', plot_bgcolor = 'white', height = H, width = 980,
                      title = {'text': '<b>' + Name + '</b>', 'x': 0.5, 'y': titleY,
                               'xanchor': 'center', 'yanchor': 'top'},
                      yaxis_title = 'Frequency')
    fig.update_traces(marker_line_color = LC, marker_line_width = 0.5, opacity = 1)
    fig['layout']['yaxis'].update(range = yLim)
    fig.show()
def DispPlot2(Feat, ColorFeat, xLim, yLim, Target = Target, nbins = 5, titleY = 0.90, LC = 'Black', H2 = 320, W = 980,
              Colors1 = ['Pink', 'BlueViolet'], Colors2 = ['OrangeRed', 'LimeGreen'], Inp = Data):
    """Show two figures describing `Feat` split by `ColorFeat` and `Target`.

    Top:    side-by-side stacked histograms of `Feat` coloured by `ColorFeat`,
            one panel for each of the first two module-level ``Labels``.
    Bottom: box plots of `Feat` (x) per `ColorFeat` (y), coloured by `Target`.

    xLim, yLim: axis ranges ([low, high]); yLim applies to the top figure only.
    nbins:      number of histogram bins requested from plotly.
    Colors1:    colour sequence for `ColorFeat` in the histograms (None lets
                plotly choose).
    Colors2:    colour sequence for `Target` in the box plots.
    H2, W:      height of the bottom figure and width of both figures.
    Inp:        DataFrame to plot (used for both figures).
    """
    axis_style = dict(showline = True, linewidth = 1, linecolor = 'Lightgray', mirror = True,
                      zerolinewidth = 1, zerolinecolor = 'Black',
                      gridwidth = 1, gridcolor = 'Lightgray')
    # Top Figure
    fig = make_subplots(rows = 1, cols = 2, horizontal_spacing = 0.02, shared_yaxes = True,
                        subplot_titles = ('%s: <b>%s</b>' % (Target, Labels[0]),
                                          '%s: <b>%s</b>' % (Target, Labels[1])))
    for col in (1, 2):
        subset = Inp.loc[Inp[Target] == Labels[col - 1]].sort_values(by = ColorFeat)
        # color_discrete_sequence=None is plotly's default, so no branch is
        # needed for Colors1 being None.
        panel = px.histogram(subset, x = Feat, color = ColorFeat, nbins = nbins,
                             color_discrete_sequence = Colors1)
        for trace in panel['data']:
            fig.add_trace(trace, row = 1, col = col)
    fig.update_traces(marker_line_color = LC, marker_line_width = 1, opacity = 1)
    fig.update_yaxes(title_text = 'Frequency', row = 1, col = 1)
    # Both panels share one legend; hide the duplicate entries of the right one.
    fig.update_traces(showlegend = False, row = 1, col = 2)
    fig.update_xaxes(zeroline = False, showgrid = False, range = xLim, **axis_style)
    fig.update_yaxes(zeroline = True, showgrid = True, range = yLim, **axis_style)
    fig.update_layout(plot_bgcolor = 'white', barmode = 'stack', width = W)
    Name = '%s Distribution by %s and %s' % (Feat, ColorFeat, Target)
    fig.update_layout(title = {'text': '<b>' + Name + '</b>', 'x': 0.5, 'y': titleY,
                               'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
    # Bottom Figure -- built from Inp (previously always the global Data, so a
    # custom Inp was ignored here).
    fig = px.box(Inp, x = Feat, y = ColorFeat, color = Target,
                 color_discrete_sequence = Colors2, hover_data = Inp.columns)
    fig.update_traces(quartilemethod = 'linear')
    fig.update_xaxes(zeroline = False, showgrid = False, range = xLim, **axis_style)
    fig.update_yaxes(zeroline = True, showgrid = True, **axis_style)
    fig.update_layout(plot_bgcolor = 'white', barmode = 'stack', height = H2, width = W)
    fig.show()
def PlotX(df, Feat, ColorFeat, Target = Target, Labels = Labels,
          Colors = list(mcolors.TABLEAU_COLORS.values()), LC = 'Black',
          yLim = [0, 35], H = 500, titleY = 0.90):
    """Show side-by-side bar charts of 'Percentage' by `Feat`, one per label.

    df:        table with the columns `Feat`, `ColorFeat`, `Target` and
               'Percentage' (the layout returned by FeatAgg).
    Feat:      column on the x axis.
    ColorFeat: column used to colour the bars and title the legend.
    Labels:    the first two entries select the `Target` value shown in the
               left and right panel respectively.
    Colors:    colour sequence for `ColorFeat` (None lets plotly choose).
    LC:        bar outline colour.
    yLim:      [low, high] range of the shared y axis.
    H:         figure height in pixels.
    titleY:    vertical position of the title.
    """
    # Figure
    fig = make_subplots(rows = 1, cols = 2, horizontal_spacing = 0.02, shared_yaxes = True, y_title = 'Percent',
                        subplot_titles = ('%s: <b>%s</b>' % (Target, Labels[0]),
                                          '%s: <b>%s</b>' % (Target, Labels[1])))
    for col in (1, 2):
        # color_discrete_sequence=None is plotly's default, so Colors=None
        # needs no separate branch (and `== None` is avoided for arrays).
        bars = px.bar(df.loc[df[Target] == Labels[col - 1]], x = Feat, y = 'Percentage', orientation = 'v',
                      color = ColorFeat, text = 'Percentage', hover_data = df.columns,
                      color_discrete_sequence = Colors)
        for trace in bars['data']:
            fig.add_trace(trace, row = 1, col = col)
        trace_style = dict(marker_line_color = LC, marker_line_width = 1, opacity = 1)
        if col == 2:
            # The panels share one legend; hide the right panel's duplicates.
            trace_style['showlegend'] = False
        fig.update_traces(row = 1, col = col, **trace_style)
    # Update
    axis_style = dict(showline = True, linewidth = 1, linecolor = 'Lightgray', mirror = True,
                      zeroline = False, zerolinewidth = 1, zerolinecolor = 'Black',
                      gridwidth = 1, gridcolor = 'Lightgray')
    fig.update_xaxes(showgrid = False, **axis_style)
    fig.update_yaxes(showgrid = True, range = yLim, **axis_style)
    fig.update_layout(legend_orientation = 'v', legend_title_text = ColorFeat, plot_bgcolor = 'white',
                      height = H, width = 980)
    fig.update_layout(legend = dict(font = dict(color = "Black"), bordercolor = "Lightgray", borderwidth = 1))
    Name = '%s Distribution by %s and %s' % (Target, Feat, ColorFeat)
    fig.update_layout(title = {'text': '<b>' + Name + '</b>', 'x': 0.5, 'y': titleY,
                               'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# Colour sequences reused by the plots below.
_RISK_COLORS = ['LightCoral', 'LimeGreen']
_HOUSING_COLORS = ['RoyalBlue', 'ForestGreen', 'Tomato']
_SAVING_COLORS = ['YellowGreen', 'ForestGreen', 'Gray', 'Coral', 'Tomato']
_CHECKING_COLORS = ['YellowGreen', 'ForestGreen', 'Gray', 'Tomato']

# Binned copy of the data: every int64 column is replaced by its interval
# label, using the bin edges stored in Feat_Dict.
df = Data.copy()
for Feat in dType['int64']:
    df[Feat] = FeatBins(Inp = Data[Feat], Bins = Feat_Dict[Feat], replace = False)

# ---- Age -------------------------------------------------------------------
Feat = 'Age'
DistPlot(Feat = Feat, Colors = list(_RISK_COLORS), yLim = [0, 250], nbins = 20)
ColorFeat = 'Housing'
Table = FeatAgg(Feat, ColorFeat, Inp = df)
PlotX(Table, Feat, ColorFeat, Target = Target, yLim = [0, 25], Colors = list(_HOUSING_COLORS))
for ColorFeat, _palette in (('Saving Accounts', _SAVING_COLORS), ('Checking Account', _CHECKING_COLORS)):
    Table = FeatAgg(Feat, ColorFeat, Inp = df).sort_values(by = [ColorFeat])
    PlotX(Table, Feat, ColorFeat, Target = Target, yLim = [0, 20], Colors = list(_palette))

# ---- Credit Amount ---------------------------------------------------------
Feat = 'Credit Amount'
DistPlot(Feat = Feat, Colors = list(_RISK_COLORS), yLim = [0, 400])
DispPlot2(Feat = Feat, ColorFeat = 'Sex', xLim = [0, 22e3], yLim = [0, 350], nbins = 10,
          Colors1 = ['DeepPink', 'RoyalBlue'], Colors2 = list(_RISK_COLORS), Inp = Data)
for ColorFeat, _palette in (('Saving Accounts', _SAVING_COLORS), ('Checking Account', _CHECKING_COLORS)):
    Table = FeatAgg(Feat, ColorFeat, Inp = df).sort_values(by = [ColorFeat])
    PlotX(Table, Feat, ColorFeat, Target = Target, yLim = [0, 30], Colors = list(_palette))

# ---- Duration --------------------------------------------------------------
Feat = 'Duration'
DistPlot(Feat = Feat, Colors = list(_RISK_COLORS), yLim = [0, 500], nbins = 10)
for ColorFeat, _palette, _top in (('Saving Accounts', _SAVING_COLORS, 14),
                                  ('Checking Account', _CHECKING_COLORS, 10)):
    Table = FeatAgg(Feat, ColorFeat, Inp = df).sort_values(by = [ColorFeat])
    PlotX(Table, Feat, ColorFeat, Target = Target, yLim = [0, _top], Colors = list(_palette))

# ---- Saving Accounts -------------------------------------------------------
Feat = 'Saving Accounts'
ColorFeat = 'Purpose'
# This table is aggregated from the unbinned Data and uses PlotX's default colours.
Table = FeatAgg(Feat, ColorFeat, Inp = Data).sort_values(by = [ColorFeat])
PlotX(Table, Feat, ColorFeat, Target = Target, yLim = [0, 14])
ColorFeat = 'Checking Account'
Table = FeatAgg(Feat, ColorFeat, Inp = df).sort_values(by = [ColorFeat])
PlotX(Table, Feat, ColorFeat, Target = Target, yLim = [0, 20], Colors = list(_CHECKING_COLORS))